{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Sending Requests\n",
    "This section provides a quick-start guide for using RTP-LLM in chat completions after successfully installing RTP-LLM.\n",
    "\n",
    "- For multimodal models, refer to [OpenAI APIs - Vision](./openai_api_vision.ipynb).\n",
    "- For embedding models, refer to [OpenAI APIs - Embedding](../backend/openai_api_embeddings.ipynb) and [Encode (embedding model)](../backend/native_api.html#Encode-(embedding-model))."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Launch A Server"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import socket\n",
    "import subprocess\n",
    "import time\n",
    "import logging\n",
    "import psutil\n",
    "import requests\n",
    "import json\n",
    "from rtp_llm.utils.util import wait_sever_done, stop_server\n",
    "port=8090\n",
    "server_process = subprocess.Popen(\n",
    "        [\"/opt/conda310/bin/python\", \"-m\", \"rtp_llm.start_server\",\n",
    "         \"--checkpoint_path=/mnt/nas1/hf/models--Qwen--Qwen1.5-0.5B-Chat/snapshots/6114e9c18dac0042fa90925f03b046734369472f/\",\n",
    "         \"--model_type=qwen_2\",\n",
    "         f\"--start_port={port}\"\n",
    "         ]\n",
    "    )\n",
    "wait_sever_done(server_process, port)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Using cURL\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import subprocess, json\n",
    "port=8090\n",
    "curl_command = f\"\"\"\n",
    "curl -s http://localhost:{port}/v1/chat/completions \\\n",
    "  -H \"Content-Type: application/json\" \\\n",
    "  -d '{\"messages\": [{\"role\": \"user\", \"content\": \"What is the capital of France?\"}]}'\n",
    "\"\"\"\n",
    "\n",
    "response = json.loads(subprocess.check_output(curl_command, shell=True))\n",
    "print(f\"Output: {response.json()}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Using Python Requests"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "port=8090\n",
    "url = f\"http://localhost:{port}/v1/chat/completions\"\n",
    "json_data = {\n",
    "     \"messages\": [\n",
    "          {\n",
    "               \"role\": \"user\",\n",
    "               \"content\": \"What is the capital of France?\"\n",
    "          }\n",
    "     ]\n",
    "}\n",
    "\n",
    "response = requests.post(url, json=json_data)\n",
    "print(f\"Output 0: {response.json()}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Using OpenAI Python Client\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import openai\n",
    "port=8090\n",
    "client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1/chat/completions\", api_key=\"None\")\n",
    "\n",
    "response = client.chat.completions.create(\n",
    "    model=\"qwen/qwen2.5-0.5b-instruct\",\n",
    "    messages=[\n",
    "        {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n",
    "    ],\n",
    "    temperature=0,\n",
    "    max_tokens=64,\n",
    ")\n",
    "print(response)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Streaming"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import openai\n",
    "port=8090\n",
    "client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1/chat/completions\", api_key=\"None\")\n",
    "\n",
    "# Use stream=True for streaming responses\n",
    "response = client.chat.completions.create(\n",
    "    model=\"qwen/qwen2.5-0.5b-instruct\",\n",
    "    messages=[\n",
    "        {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n",
    "    ],\n",
    "    temperature=0,\n",
    "    max_tokens=64,\n",
    "    stream=True,\n",
    ")\n",
    "\n",
    "# Handle the streaming output\n",
    "for chunk in response:\n",
    "    if chunk.choices[0].delta.content:\n",
    "        print(chunk.choices[0].delta.content, end=\"\", flush=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Using Native Generation APIs\n",
    "\n",
    "You can also use the native `/generate` endpoint with requests, which provides more flexibility. An API reference is available at [Sampling Parameters](sampling_params.md)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "import requests\n",
    "port=8090\n",
    "response = requests.post(\n",
    "    f\"http://localhost:{port}/\",\n",
    "    json={\n",
    "        \"prompt\": \"The capital of France is\",\n",
    "        \"generate_config\": {\n",
    "            \"temperature\": 0,\n",
    "            \"max_new_tokens\": 32,\n",
    "        },\n",
    "        \"yield_generator\": False\n",
    "    },\n",
    "    stream=True,\n",
    ")\n",
    "\n",
    "print(f\"Output 0: {response.json()}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Streaming"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests, json\n",
    "from typing_extensions import Literal\n",
    "port=8090\n",
    "response = requests.post(\n",
    "    f\"http://localhost:{port}/\",\n",
    "    json={\n",
    "        \"prompt\": \"The capital of France is\",\n",
    "        \"generate_config\": {\n",
    "            \"temperature\": 0,\n",
    "            \"max_new_tokens\": 32,\n",
    "        },\n",
    "        \"yield_generator\": True\n",
    "    },\n",
    "    stream=True,\n",
    ")\n",
    "\n",
    "prev: Literal[0] = 0\n",
    "for chunk in response.iter_lines(decode_unicode=False):\n",
    "    chunk = chunk.decode(\"utf-8\")\n",
    "    if chunk and chunk.startswith(\"data:\"):\n",
    "        if chunk == \"data:[done]\":\n",
    "            break\n",
    "        data = json.loads(chunk[5:].strip(\"\\n\"))\n",
    "        output = data[\"response\"]\n",
    "        print(output[prev:], end=\"\", flush=True)\n",
    "        prev = len(output)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "stop_server(server_process)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.8.9 64-bit",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.9"
  },
  "vscode": {
   "interpreter": {
    "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
